In [23]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.feature_selection import RFE
from sklearn.linear_model import LassoCV
In [24]:
# Load the dataset (update the path to your dataset)
data = pd.read_csv("C:\\Users\\dongr\\OneDrive\\Desktop\\new proj\\diabetes.csv")
# Display the first few rows of the dataset
data.head()
Out[24]:
| id | chol | stab.glu | hdl | ratio | glyhb | location | age | gender | height | weight | frame | bp.1s | bp.1d | bp.2s | bp.2d | waist | hip | time.ppn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000 | 203.0 | 82 | 56.0 | 3.6 | 4.31 | Buckingham | 46 | female | 62.0 | 121.0 | medium | 118.0 | 59.0 | NaN | NaN | 29.0 | 38.0 | 720.0 |
| 1 | 1001 | 165.0 | 97 | 24.0 | 6.9 | 4.44 | Buckingham | 29 | female | 64.0 | 218.0 | large | 112.0 | 68.0 | NaN | NaN | 46.0 | 48.0 | 360.0 |
| 2 | 1002 | 228.0 | 92 | 37.0 | 6.2 | 4.64 | Buckingham | 58 | female | 61.0 | 256.0 | large | 190.0 | 92.0 | 185.0 | 92.0 | 49.0 | 57.0 | 180.0 |
| 3 | 1003 | 78.0 | 93 | 12.0 | 6.5 | 4.63 | Buckingham | 67 | male | 67.0 | 119.0 | large | 110.0 | 50.0 | NaN | NaN | 33.0 | 38.0 | 480.0 |
| 4 | 1005 | 249.0 | 90 | 28.0 | 8.9 | 7.72 | Buckingham | 64 | male | 68.0 | 183.0 | medium | 138.0 | 80.0 | NaN | NaN | 44.0 | 41.0 | 300.0 |
In [25]:
# Get basic information about the dataset
print(data.shape) # Number of rows and columns
print(data.info()) # Data types and non-null counts
print(data.describe()) # Summary statistics
# Check for missing values
missing_values = data.isnull().sum()
print("Missing Values:\n", missing_values[missing_values > 0])
(403, 19)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403 entries, 0 to 402
Data columns (total 19 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 403 non-null int64
1 chol 402 non-null float64
2 stab.glu 403 non-null int64
3 hdl 402 non-null float64
4 ratio 402 non-null float64
5 glyhb 390 non-null float64
6 location 403 non-null object
7 age 403 non-null int64
8 gender 403 non-null object
9 height 398 non-null float64
10 weight 402 non-null float64
11 frame 391 non-null object
12 bp.1s 398 non-null float64
13 bp.1d 398 non-null float64
14 bp.2s 141 non-null float64
15 bp.2d 141 non-null float64
16 waist 401 non-null float64
17 hip 401 non-null float64
18 time.ppn 400 non-null float64
dtypes: float64(13), int64(3), object(3)
memory usage: 59.9+ KB
None
id chol stab.glu hdl ratio \
count 403.000000 402.000000 403.000000 402.000000 402.000000
mean 15978.310174 207.845771 106.672457 50.445274 4.521642
std 11881.122124 44.445557 53.076655 17.262626 1.727886
min 1000.000000 78.000000 48.000000 12.000000 1.500000
25% 4792.500000 179.000000 81.000000 38.000000 3.200000
50% 15766.000000 204.000000 89.000000 46.000000 4.200000
75% 20336.000000 230.000000 106.000000 59.000000 5.400000
max 41756.000000 443.000000 385.000000 120.000000 19.299999
glyhb age height weight bp.1s bp.1d \
count 390.000000 403.000000 398.000000 402.000000 398.000000 398.000000
mean 5.589769 46.851117 66.020101 177.592040 136.904523 83.321608
std 2.242595 16.312333 3.918515 40.340666 22.741033 13.589227
min 2.680000 19.000000 52.000000 99.000000 90.000000 48.000000
25% 4.380000 34.000000 63.000000 151.000000 121.250000 75.000000
50% 4.840000 45.000000 66.000000 172.500000 136.000000 82.000000
75% 5.600000 60.000000 69.000000 200.000000 146.750000 90.000000
max 16.110001 92.000000 76.000000 325.000000 250.000000 124.000000
bp.2s bp.2d waist hip time.ppn
count 141.000000 141.000000 401.000000 401.000000 400.000000
mean 152.382979 92.524823 37.900249 43.039900 341.250000
std 21.712952 11.555198 5.729313 5.656713 309.540953
min 110.000000 60.000000 26.000000 30.000000 5.000000
25% 138.000000 84.000000 33.000000 39.000000 90.000000
50% 149.000000 92.000000 37.000000 42.000000 240.000000
75% 161.000000 100.000000 41.000000 46.000000 517.500000
max 238.000000 124.000000 56.000000 64.000000 1560.000000
Missing Values:
chol 1
hdl 1
ratio 1
glyhb 13
height 5
weight 1
frame 12
bp.1s 5
bp.1d 5
bp.2s 262
bp.2d 262
waist 2
hip 2
time.ppn 3
dtype: int64
In [26]:
non_numeric_columns = data.select_dtypes(exclude=[np.number]).columns
print(non_numeric_columns)
Index(['location', 'gender', 'frame'], dtype='object')
In [27]:
data[non_numeric_columns] = data[non_numeric_columns].apply(pd.to_numeric, errors='coerce')
In [28]:
data.drop(columns=non_numeric_columns, inplace=True)
In [29]:
# Fill missing values or drop rows/columns (example filling with mean)
data.fillna(data.mean(), inplace=True)
In [30]:
# Example: if you have Weight and Height, calculate BMI (adjust according to your dataset)
if 'Weight' in data.columns and 'Height' in data.columns:
data['BMI'] = data['Weight'] / (data['Height'] ** 2)
In [31]:
# Create a list of numerical features to scale
numerical_features = ['height', 'weight']
scaler = StandardScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])
In [32]:
# Example: Histogram of a feature
sns.histplot(data['age'], bins=30)
plt.title('Distribution of age')
plt.xlabel('age')
plt.ylabel('Frequency')
plt.show()
In [33]:
plt.figure(figsize=(10, 8))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.show()
In [34]:
# visualizing through abscatterplot diagram
sns.scatterplot(data=data, x='chol', y='hdl', hue='age')
plt.title('Cholesterol vs HDL with Age as Hue')
plt.xlabel('Cholesterol')
plt.ylabel('HDL')
plt.legend(title='Age')
plt.show()
In [35]:
#pair plot visualization
sns.pairplot(data, hue='age')
plt.show()
In [64]:
# Define features and target variable
features = data.columns.tolist() # Adjust based on your dataset and selected features
target = 'age'
X = data[features]
y = data[target]
# Apply RFE
model = RandomForestClassifier()
selector = RFE(model, n_features_to_select=5) # Adjust number as needed
selector = selector.fit(X, y)
# Store selected features
selected_features_rfe = [features[i] for i in range(len(features)) if selector.support_[i]]
print("Selected Features using RFE: ", selected_features_rfe)
Selected Features using RFE: ['id', 'chol', 'hdl', 'glyhb', 'age']
In [65]:
# Define models
models = {
'SVM': SVC(probability=True, random_state=42),
'Decision Tree': DecisionTreeClassifier(random_state=42),
'Random Forest': RandomForestClassifier(random_state=42),
'xgboost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}
In [66]:
# Feature Scaling
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data.drop('age',axis=1))
data_scaled = pd.DataFrame(data_scaled, columns=data.columns[:-1])
In [60]:
X = data_scaled
y = data['age']
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000) # Ensure convergence
# Initialize RFE with the model and specify the number of features to select
rfe = RFE(estimator=model, n_features_to_select=5) # Use keyword argument for n_features_to_select
# Fit RFE
fit = rfe.fit(X, y)
# Output selected features
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)
Selected Features: [False False False True True True False True True False False False False False False] Feature Ranking: [ 4 10 7 1 1 1 9 1 1 5 6 3 2 8 11]
In [62]:
X_train, X_test, y_train, y_test = train_test_split(data[selected_features_rfe], data[target], test_size=0.2, random_state=42)
In [86]:
# Create a dictionary of models
models = {
"SVM": SVC(probability=True),
"Decision Tree": DecisionTreeClassifier(),
"Random Forest": RandomForestClassifier()
}
# Fit each model to the training data and print a success message
for name, model in models.items():
model.fit(X_train, y_train)
print(f"{name} trained successfully.")
SVM trained successfully. Decision Tree trained successfully. Random Forest trained successfully.
In [88]:
for name, model in models.items():
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
print(f"{name} Accuracy: {scores.mean():.2f} +/- {scores.std():.2f}")
C:\Users\dongr\anaconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
SVM Accuracy: 0.03 +/- 0.01 Decision Tree Accuracy: 0.91 +/- 0.03
C:\Users\dongr\anaconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn( C:\Users\dongr\anaconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
Random Forest Accuracy: 0.53 +/- 0.03